# Clear the workspace
# NOTE(review): this removes every object from the session the script is run
# in. It is kept because the script has always started this way; the code
# below no longer depends on the workspace being empty.
rm(list = ls())

#
# SETTINGS ----
#

# The path to where the data are stored
pathtodata <- "G:\\a folder" # Set this variable

# List of subject names
subjectlist <- c("m1", "m4", "m5") # Set this variable

# The measures we care about getting
# Set this variable
measures <- c("sF0", "strF0", "CPP", "H1H2c", "H1A1c", "H1A2c", "H1A3c")

# The number of samples to remove from the beginning and end of each segment.
# Note: A typical number to use here is 12, since VoiceSauce samples use
# a 25-ms window. A segment with fewer than (numtocut*2) samples cannot be
# trimmed, and the script stops with an error when it meets one. A segment
# with fewer than (numtocut*2 + n_windows) samples is processed, but some of
# its windows will contain no samples. If either is a problem, set numtocut
# to a smaller number. (Setting numtocut to 0 will make nothing be cut from
# the segments.)
numtocut <- 12 # Set this variable

# The number of windows to normalize into
n_windows <- 10 # Set this variable

#
# HELPERS ----
#

# Read one subject's tab-delimited file from `datadir` and add the Subject
# and Duration columns. The file is expected to contain the columns Label,
# seg_Start and seg_End.
# You may need to modify the file name depending on your naming conventions.
read_subject <- function(subject, datadir) {
  subjdata <- read.table(
    file = file.path(datadir, paste0(subject, ".txt")),
    header = TRUE,
    sep = "\t"
  )

  # Add a column saying which subject this is
  subjdata$Subject <- subject

  # Make Label a factor
  subjdata$Label <- factor(subjdata$Label)

  # Get duration of each segment
  subjdata$Duration <- subjdata$seg_End - subjdata$seg_Start

  subjdata
}

# Build the window labels for one segment of `seg_len` samples.
# Returns a vector of length `seg_len`: NA for the first and last `numtocut`
# samples (so they are left out of the averages) and a window number in
# 1..n_windows for every sample in between.
assign_windows <- function(seg_len, numtocut, n_windows) {
  usable <- seg_len - 2 * numtocut
  if (usable < 0) {
    stop(
      "A segment has only ", seg_len, " samples, fewer than 2 * numtocut (",
      2 * numtocut, "); set numtocut to a smaller number.",
      call. = FALSE
    )
  }

  # Samples left over after giving every window the same whole number of
  # samples
  leftover <- usable %% n_windows

  if (2 * leftover < n_windows) {
    # usable / n_windows rounds DOWN: every window gets the rounded-down
    # number of samples and the leftover samples join the last window.
    per_window <- usable %/% n_windows
    core <- c(
      rep(seq_len(n_windows), each = per_window),
      rep(n_windows, leftover)
    )
  } else {
    # usable / n_windows rounds UP: every window gets the rounded-up number
    # of samples and the sequence is cut off at the end of the segment, so
    # the last window(s) come out shorter. n_windows * per_window is always
    # at least `usable` here, so length.out only truncates, never recycles.
    per_window <- usable %/% n_windows + 1
    core <- rep(seq_len(n_windows), each = per_window, length.out = usable)
  }

  c(rep(NA, numtocut), core, rep(NA, numtocut))
}

#
# IMPORT THE DATA ----
#

# Load each subject's data and combine them all into one data frame
alldata <- do.call(
  rbind,
  lapply(subjectlist, read_subject, datadir = pathtodata)
)

# Turn 0s into NAs, because a VoiceSauce output of 0 does not mean that
# measure was 0, it just means VoiceSauce was unable to get that measure for
# that sample. Only numeric columns are touched, and the segment bookkeeping
# columns are left alone: a segment that starts at time 0 must keep its
# seg_Start, otherwise it would be dropped when averaging.
keep_zero <- c("seg_Start", "seg_End", "Duration")
is_measure <- vapply(alldata, is.numeric, logical(1)) &
  !(names(alldata) %in% keep_zero)
alldata[is_measure] <- lapply(alldata[is_measure], function(values) {
  values[!is.na(values) & values == 0] <- NA
  values
})

#
# AVERAGE INTO N WINDOWS ----
#

# Length (in samples) of each segment. A new segment starts at every row
# whose Subject / seg_Start / seg_End differ from the row before it. Rows are
# compared with their neighbour only, so two segments that happen to share a
# start time (e.g. in different subjects) are still counted separately.
segment_key <- paste(alldata$Subject, alldata$seg_Start, alldata$seg_End)
seg_lengths <- rle(segment_key)$lengths

# For each sample, the window it will be averaged into (1 to n_windows), or
# NA for the samples trimmed from the edges of each segment.
Window <- unlist(
  lapply(
    seg_lengths,
    assign_windows,
    numtocut = numtocut,
    n_windows = n_windows
  )
)

# Every sample must have received exactly one label
stopifnot(length(Window) == nrow(alldata))

# Add the Window column to the data frame
alldata$Window <- Window

# This does the averaging over windows: one mean per measure listed in
# "measures" for each combination of the grouping columns below. Rows whose
# Window is NA (the trimmed samples) are left out by aggregate().
windowdata <- aggregate(
  alldata[measures],
  by = list(
    # Get a separate average for each subject
    Subject = alldata$Subject,
    # Get a separate average for each segment (i.e., each word)
    seg_Start = alldata$seg_Start,
    # Keep the Label information corresponding to each segment
    Label = alldata$Label,
    # Get a separate average for each window
    Window = alldata$Window,
    # Keep the Duration information corresponding to each segment
    Duration = alldata$Duration
  ),
  FUN = mean,
  # skip missing values
  na.rm = TRUE
)

#
# GENERAL CLEANUP ----
#

# Remove whitespace from the beginnings of labels
windowdata$Label <- sub("^\\s+", "", windowdata$Label)

# Make important predictors into factors
windowdata$Window <- factor(windowdata$Window)
windowdata$Label <- factor(windowdata$Label)
windowdata$Subject <- factor(windowdata$Subject)